import pandas as pd # To read the data set
import numpy as np # Importing numpy library
import seaborn as sns # For data visualization
import matplotlib.pyplot as plt # Necessary library for plotting graphs
%matplotlib inline
sns.set(color_codes = True)
from sklearn import metrics # Importing metrics
from sklearn.model_selection import train_test_split # Splitting data into train and test set
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix
from sklearn.preprocessing import StandardScaler # Importing to standardize the data
from sklearn.impute import SimpleImputer # Importing to fill in zero values in the data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures # Importing polynomial features library
from sklearn.decomposition import PCA # Importing to run pca analysis on data
from sklearn.model_selection import KFold, cross_val_score # Importing kfold for cross validation
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV # Importing for hypertuning model
from sklearn.cluster import KMeans # For KMeans cluster model building
from scipy.stats import zscore # Import zscore library
from scipy.spatial.distance import cdist # Importing cdist functionality for elbow graph
import tensorflow # Importing tensorflow library
from tensorflow.keras.models import Sequential # Importing tensorflow library
from tensorflow.keras.utils import to_categorical # Importing tensorflow library
from tensorflow.keras import optimizers # Importing optimizers
from tensorflow.keras.layers import Dense, Dropout, Activation # Importing necessary libraries
from skimage.color import rgb2gray # Loading color library
from sklearn.preprocessing import OneHotEncoder # Library for one hot encoding
from sklearn.metrics import confusion_matrix # Loading necessary library
from tensorflow.keras.preprocessing.image import ImageDataGenerator # Loading image generator
from tensorflow import keras # Loading keras libaray
# Load the signal dataset and take a first look at its structure.
df = pd.read_csv('Part- 1,2&3 - Signal.csv')
df.head()   # first five rows
df.tail()   # last five rows
df.shape    # (rows, columns)
df.size     # total number of cells (rows * columns)
df.isnull().sum()   # per-column count of missing values
df.dtypes   # data type of each column
df.info()   # index, dtypes, non-null counts and memory usage in one summary
1. The dataset consists of 1599 entries and 12 columns.
2. On checking for lapses in the dataset, we can conclude that the data has no null values and does not require any major cleaning.
# One horizontal box plot per column on a single figure, as a first view of
# where outliers sit across the whole dataset.
fig, ax = plt.subplots(figsize=(14, 9))
sns.boxplot(data=df, orient='h', palette='Set1', dodge=False, ax=ax);
Observation:
From the above boxplot we can see that outliers are present in almost all columns. I will count the outliers during the individual attribute analysis and fix them after visualizing and analyzing each attribute.
# Plotting a visual analysis of parameter 1: distribution (histogram + KDE)
# on the left, box plot on the right.
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# sns.distplot is deprecated/removed in recent seaborn; histplot with a KDE
# overlay on a density scale is the documented replacement.
sns.histplot(df['Parameter 1'], ax = ax1, color = 'red', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 1', fontsize = 15)
# Pass the series by keyword: in seaborn >= 0.12 a bare positional argument is
# interpreted as `data`, which would change the plot's orientation.
sns.boxplot(x = df['Parameter 1'], ax = ax2, color = 'red')
ax2.set_title('Box Plot', fontsize = 15)
ax2.set_xlabel('Parameter 1', fontsize = 15)
# Checking outliers in parameter 1 with the 1.5 * IQR whisker rule.
outliers_cols0 = []
Q1 = df['Parameter 1'].quantile(0.25) # 1st Quartile
Q3 = df['Parameter 1'].quantile(0.75) # 3rd Quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para1 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para1 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range =', IQR)
print('Parameter 1 <', LTV_para1, 'and >', UTV_para1, 'are outliers')
print('Number of outliers in parameter 1 column below the lower whisker =', df[df['Parameter 1'] < LTV_para1]['Parameter 1'].count())
print('Number of outliers in parameter 1 column above the upper whisker =', df[df['Parameter 1'] > UTV_para1]['Parameter 1'].count())
outliers_cols0.append('Parameter 1')
# NOTE(review): each attribute section re-assigns this dict, overwriting the
# previous entry; build one dict and .update() it if cumulative bounds matter.
upperLowerBound_Disct = {'Parameter 1' : UTV_para1}
Observation :
We can observe from the outlier analysis above that we have a total of 49 outliers in "Parameter 1" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 2: distribution plot and box plot.
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(df['Parameter 2'], ax = ax1, color = 'b', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 2', fontsize = 15)
# Keyword `x=` keeps the value axis horizontal on modern seaborn.
sns.boxplot(x = df['Parameter 2'], ax = ax2, color = 'b')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 2', fontsize = 15)
# Checking outliers in parameter 2 with the 1.5 * IQR whisker rule.
outliers_cols1 = []
Q1 = df['Parameter 2'].quantile(0.25) # 1st Quartile
Q3 = df['Parameter 2'].quantile(0.75) # 3rd Quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para2 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para2 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range =', IQR)
print('Parameter 2 <', LTV_para2, 'and >', UTV_para2, 'are outliers')
print('Number of outliers in the parameter 2 column below the lower whisker =', df[df['Parameter 2'] < LTV_para2]['Parameter 2'].count())
print('Number of outliers in the parameter 2 column above the upper whisker =', df[df['Parameter 2'] > UTV_para2]['Parameter 2'].count())
outliers_cols1.append('Parameter 2')
# NOTE(review): overwrites the dict assigned by earlier sections.
upperLowerBound_Disct = {'Parameter 2' : UTV_para2}
Observation :
We can observe from the outlier analysis above that we have a total of 19 outliers in "Parameter 2" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 3: distribution plot and box plot.
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(df['Parameter 3'], ax = ax1, color = 'green', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 3', fontsize = 15)
# Keyword `x=` keeps the value axis horizontal on modern seaborn.
sns.boxplot(x = df['Parameter 3'], ax = ax2, color = 'green')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 3', fontsize = 15)
# Checking outliers in parameter 3 with the 1.5 * IQR whisker rule.
outliers_cols2 = []
Q1 = df['Parameter 3'].quantile(0.25) # 1st Quartile
Q3 = df['Parameter 3'].quantile(0.75) # 3rd Quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para3 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para3 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range = ', IQR)
print('Parameter 3 <', LTV_para3, 'and >', UTV_para3, 'are outliers')
print('Number of outliers in the parameter 3 column below the lower whisker =', df[df['Parameter 3'] < LTV_para3]['Parameter 3'].count())
print('Number of outliers in the parameter 3 column above the upper whisker =', df[df['Parameter 3'] > UTV_para3]['Parameter 3'].count())
outliers_cols2.append('Parameter 3')
# NOTE(review): overwrites the dict assigned by earlier sections.
upperLowerBound_Disct = {'Parameter 3' : UTV_para3}
Observation :
We can observe from the outlier analysis above that we have a total of 1 outlier in "Parameter 3" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 4: distribution plot and box plot.
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(df['Parameter 4'], ax = ax1, color = 'purple', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 4', fontsize = 15)
# Keyword `x=` keeps the value axis horizontal on modern seaborn.
sns.boxplot(x = df['Parameter 4'], ax = ax2, color = 'purple')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 4', fontsize = 15)
# Checking outliers in parameter 4 with the 1.5 * IQR whisker rule.
outliers_cols3 = []
Q1 = df['Parameter 4'].quantile(0.25) # 1st Quartile
Q3 = df['Parameter 4'].quantile(0.75) # 3rd Quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para4 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para4 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range = ', IQR)
print('Parameter 4 <', LTV_para4, 'and >', UTV_para4, 'are outliers')
print('Number of outliers in the parameter 4 column below the lower whisker =', df[df['Parameter 4'] < LTV_para4]['Parameter 4'].count())
print('Number of outliers in the parameter 4 column above the upper whisker =', df[df['Parameter 4'] > UTV_para4]['Parameter 4'].count())
outliers_cols3.append('Parameter 4')
# NOTE(review): overwrites the dict assigned by earlier sections.
upperLowerBound_Disct = {'Parameter 4' : UTV_para4}
Observation :
We can observe from the outlier analysis above that we have a total of 155 outliers in "Parameter 4" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 5: distribution plot and box plot.
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(df['Parameter 5'], ax = ax1, color = 'orange', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 5', fontsize = 15)
# Keyword `x=` keeps the value axis horizontal on modern seaborn.
sns.boxplot(x = df['Parameter 5'], ax = ax2, color = 'orange')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 5', fontsize = 15)
# Checking outliers in parameter 5 with the 1.5 * IQR whisker rule.
outliers_cols4 = []
Q1 = df['Parameter 5'].quantile(0.25) # 1st Quartile
Q3 = df['Parameter 5'].quantile(0.75) # 3rd Quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para5 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para5 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range = ', IQR)
print('Parameter 5 <', LTV_para5, 'and >', UTV_para5, 'are outliers')
print('Number of outliers in the parameter 5 column below the lower whisker =', df[df['Parameter 5'] < LTV_para5]['Parameter 5'].count())
print('Number of outliers in the parameter 5 column above the upper whisker =', df[df['Parameter 5'] > UTV_para5]['Parameter 5'].count())
outliers_cols4.append('Parameter 5')
# NOTE(review): overwrites the dict assigned by earlier sections.
upperLowerBound_Disct = {'Parameter 5' : UTV_para5}
Observation :
We can observe from the outlier analysis above that we have a total of 112 outlier in "Parameter 5", 9 towards the lower whisker and 103 towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 6: distribution plot and box plot.
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(df['Parameter 6'], ax = ax1, color = 'black', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 6', fontsize = 15)
# Keyword `x=` keeps the value axis horizontal on modern seaborn.
sns.boxplot(x = df['Parameter 6'], ax = ax2, color = 'black')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 6', fontsize = 15)
# Checking outliers in parameter 6 with the 1.5 * IQR whisker rule.
outliers_cols5 = []
Q1 = df['Parameter 6'].quantile(0.25) # 1st quartile
Q3 = df['Parameter 6'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para6 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para6 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range =', IQR)
print('Parameter 6 <', LTV_para6, ' and >', UTV_para6, 'are outliers')
print('Number of outliers in the parameter 6 column below the lower whisker =', df[df['Parameter 6'] < LTV_para6]['Parameter 6'].count())
print('Number of outliers in the parameter 6 column above the upper whisker =', df[df['Parameter 6'] > UTV_para6]['Parameter 6'].count())
outliers_cols5.append('Parameter 6')
# Fixed the 'upperLowerBand_Disct' typo so the name matches the other sections.
# NOTE(review): overwrites the dict assigned by earlier sections.
upperLowerBound_Disct = {'Parameter 6' : UTV_para6}
Observation :
We can observe from the outlier analysis above that we have a total of 30 outliers in "Parameter 6" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 7: distribution plot and box plot.
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(df['Parameter 7'], ax = ax1, color = 'brown', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 7', fontsize = 15)
# Keyword `x=` keeps the value axis horizontal on modern seaborn.
sns.boxplot(x = df['Parameter 7'], ax = ax2, color = 'brown')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 7', fontsize = 15)
# Checking outliers in parameter 7 with the 1.5 * IQR whisker rule.
outliers_cols6 = []
Q1 = df['Parameter 7'].quantile(0.25) # 1st quartile
Q3 = df['Parameter 7'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para7 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para7 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range =', IQR)
print('Parameter 7 <', LTV_para7, 'and >', UTV_para7, 'are outliers')
print('Number of outliers in the parameter 7 column below the lower whisker =', df[df['Parameter 7'] < LTV_para7]['Parameter 7'].count())
print('Number of outliers in the parameter 7 column above the upper whisker =', df[df['Parameter 7'] > UTV_para7]['Parameter 7'].count())
outliers_cols6.append('Parameter 7')
# NOTE(review): overwrites the dict assigned by earlier sections.
upperLowerBound_Disct = {'Parameter 7' : UTV_para7}
Observation :
We can observe from the outlier analysis above that we have a total of 55 outliers in "Parameter 7" which are towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 8: distribution plot and box plot.
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(df['Parameter 8'], ax = ax1, color = 'pink', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 8', fontsize = 15)
# Keyword `x=` keeps the value axis horizontal on modern seaborn.
sns.boxplot(x = df['Parameter 8'], ax = ax2, color = 'pink')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 8', fontsize = 15)
# Checking outliers in parameter 8 with the 1.5 * IQR whisker rule.
outliers_cols7 = []
Q1 = df['Parameter 8'].quantile(0.25) # 1st quartile
Q3 = df['Parameter 8'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para8 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para8 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range =', IQR)
print('Parameter 8 <', LTV_para8, 'and >', UTV_para8, 'are outliers')
print('Number of outliers in the parameter 8 column below the lower whisker =', df[df['Parameter 8'] < LTV_para8]['Parameter 8'].count())
# Fixed message: this count is ABOVE the upper whisker (the original said "below").
print('Number of outliers in the parameter 8 column above the upper whisker =', df[df['Parameter 8'] > UTV_para8]['Parameter 8'].count())
outliers_cols7.append('Parameter 8')
# NOTE(review): overwrites the dict assigned by earlier sections.
upperLowerBound_Disct = {'Parameter 8' : UTV_para8}
Observation :
We can observe from the outlier analysis above that we have a total of 45 outlier in "Parameter 8", 21 towards the lower whisker and 24 towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 9: distribution plot and box plot.
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(df['Parameter 9'], ax = ax1, color = 'grey', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 9', fontsize = 15)
# Keyword `x=` keeps the value axis horizontal on modern seaborn.
sns.boxplot(x = df['Parameter 9'], ax = ax2, color = 'grey')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 9', fontsize = 15)
# Checking outliers in parameter 9 with the 1.5 * IQR whisker rule.
outliers_cols8 = []
Q1 = df['Parameter 9'].quantile(0.25) # 1st quartile
Q3 = df['Parameter 9'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para9 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para9 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range =', IQR)
print('Parameter 9 <', LTV_para9, 'and >', UTV_para9, 'are outliers')
print('Number of outliers in the parameter 9 column below the lower whisker =', df[df['Parameter 9'] < LTV_para9]['Parameter 9'].count())
# Fixed message: this count is ABOVE the upper whisker (the original said "below").
print('Number of outliers in the parameter 9 column above the upper whisker =', df[df['Parameter 9'] > UTV_para9]['Parameter 9'].count())
outliers_cols8.append('Parameter 9')
# NOTE(review): overwrites the dict assigned by earlier sections.
upperLowerBound_Disct = {'Parameter 9' : UTV_para9}
Observation :
We can observe from the outlier analysis above that we have a total of 35 outlier in "Parameter 9", 14 towards the lower whisker and 21 towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 10: distribution plot and box plot.
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(df['Parameter 10'], ax = ax1, color = 'gold', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 10', fontsize = 15)
# Keyword `x=` keeps the value axis horizontal on modern seaborn.
sns.boxplot(x = df['Parameter 10'], ax = ax2, color = 'gold')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 10', fontsize = 15)
# Checking outliers in parameter 10 with the 1.5 * IQR whisker rule.
outliers_cols9 = []
Q1 = df['Parameter 10'].quantile(0.25) # 1st quartile
Q3 = df['Parameter 10'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para10 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para10 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range =', IQR)
print('Parameter 10 <', LTV_para10, 'and >', UTV_para10, 'are outliers')
print('Number of outliers in the parameter 10 column below the lower whisker =', df[df['Parameter 10'] < LTV_para10]['Parameter 10'].count())
# Fixed message: this count is ABOVE the upper whisker (the original said "below").
print('Number of outliers in the parameter 10 column above the upper whisker =', df[df['Parameter 10'] > UTV_para10]['Parameter 10'].count())
outliers_cols9.append('Parameter 10')
# NOTE(review): overwrites the dict assigned by earlier sections.
upperLowerBound_Disct = {'Parameter 10' : UTV_para10}
Observation :
We can observe from the outlier analysis above that we have a total of 59 outliers in "Parameter 10" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 11: distribution plot and box plot.
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(df['Parameter 11'], ax = ax1, color = 'white', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 11', fontsize = 15)
# Keyword `x=` keeps the value axis horizontal on modern seaborn.
sns.boxplot(x = df['Parameter 11'], ax = ax2, color = 'white')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 11', fontsize = 15)
# Checking outliers in parameter 11 with the 1.5 * IQR whisker rule.
outliers_cols10 = []
Q1 = df['Parameter 11'].quantile(0.25) # 1st quartile
Q3 = df['Parameter 11'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para11 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para11 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range =', IQR)
print('Parameter 11 <', LTV_para11, 'and >', UTV_para11, 'are outliers')
print('Number of outliers in the parameter 11 column below the lower whisker =', df[df['Parameter 11'] < LTV_para11]['Parameter 11'].count())
# Fixed message: this count is ABOVE the upper whisker (the original said "below").
print('Number of outliers in the parameter 11 column above the upper whisker =', df[df['Parameter 11'] > UTV_para11]['Parameter 11'].count())
outliers_cols10.append('Parameter 11')
# NOTE(review): overwrites the dict assigned by earlier sections.
upperLowerBound_Disct = {'Parameter 11' : UTV_para11}
Observation :
We can observe from the outlier analysis above that we have a total of 13 outliers in "Parameter 11" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of signal_strength (the target column).
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 7))
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(df['Signal_Strength'], ax = ax1, color = 'yellow', kde = True, stat = 'density')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Signal_Strength', fontsize = 15)
# Keyword `x=` keeps the value axis horizontal on modern seaborn.
sns.boxplot(x = df['Signal_Strength'], ax = ax2, color = 'yellow')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Signal_Strength', fontsize = 15)
# Checking outliers in signal_strength with the 1.5 * IQR whisker rule.
outliers_cols11 = []
Q1 = df['Signal_Strength'].quantile(0.25) # 1st quartile
Q3 = df['Signal_Strength'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para12 = Q1 - 1.5 * IQR # Lower whisker bound
UTV_para12 = Q3 + 1.5 * IQR # Upper whisker bound
print('Interquartile range =', IQR)
print('Signal_Strength <', LTV_para12, 'and >', UTV_para12, 'are outliers')
print('Number of outliers in the Signal_Strength column below the lower whisker =', df[df['Signal_Strength'] < LTV_para12]['Signal_Strength'].count())
# Fixed message: this count is ABOVE the upper whisker (the original said "below").
print('Number of outliers in the Signal_Strength column above the upper whisker =', df[df['Signal_Strength'] > UTV_para12]['Signal_Strength'].count())
outliers_cols11.append('Signal_Strength')
# NOTE(review): overwrites the dict assigned by earlier sections.
upperLowerBound_Disct = {'Signal_Strength' : UTV_para12}
Observation :
We can observe from the outlier analysis above that we have a total of 28 outlier in "Signal_Strength", 10 towards the lower whisker and 18 towards the upper whisker. We will treat them later on.
# Pairplot visual analysis to check correlation amongst different fields:
# a scatter plot for every pair of columns, KDE curves on the diagonal.
sns.pairplot(df, diag_kind = 'kde');
# Pairwise Pearson correlation matrix of the columns.
df.corr()
plt.figure(figsize = (18,12))
# Heatmap of the same correlation matrix with coefficients annotated in-cell.
sns.heatmap(df.corr(), annot = True, fmt = 'g');
Observation : From the above pair plot & heatmap we can infer the relationship amongst the attributes and target column as follows:
------------------------ Fixing Outliers ------------------------
# Showing the columns where outliers exist
print('These are the columns which have outliers : \n\n', outliers_cols0, outliers_cols1, outliers_cols3, outliers_cols4, outliers_cols5, outliers_cols6, outliers_cols7, outliers_cols8, outliers_cols9, outliers_cols10, outliers_cols11)
df_new = df.copy()
# Treating outliers present in respective columns
for col_name in df_new.columns[:]:
q1 = df_new[col_name].quantile(0.25)
q3 = df_new[col_name].quantile(0.75)
iqr = q3 - q1
low = q1 - 1.5*iqr
high = q3 + 1.5*iqr
df_new.loc[(df_new[col_name] < low) | (df_new[col_name] > high), col_name] = df_new[col_name].median()
plt.figure(figsize=(15,8))
sns.boxplot(data = df_new, orient='h', palette='Set1', dodge=False);
Observation :
Now we can see from the above boxplot analysis that the outliers have been replaced with the column medians. Because this replacement shifts each column's distribution, the recomputed whiskers can flag a few new points as outliers, which we can ignore.
# Sanity checks on the outlier-treated copy.
df_new.shape
df_new.size
df_new.head()
df_new.count(axis = 0)  # non-null count per column
# Feature engineering: three ratio features, each inserted at position
# shape[-1]-1 so that 'Signal_Strength' stays the last column.
# NOTE(review): these divisions assume the denominators are never zero —
# verify against the data.
df_new.insert(df_new.shape[-1]-1,'Parameter 2,3,9 & 10',df_new['Parameter 9']/(df_new['Parameter 2'] + df_new['Parameter 3'] + df_new['Parameter 10']))
df_new.head()
df_new.insert(df_new.shape[-1]-1, 'Parameter 1 & 11', df_new['Parameter 11']/df_new['Parameter 1'])
df_new.head()
df_new.insert(df_new.shape[-1]-1,'Parameter 4,5 & 8',df_new['Parameter 4']/(df_new['Parameter 8'] + df_new['Parameter 5']))
df_new.head()
df_new.shape
df_new.isnull().sum()
# Standardize every column (including the target) to zero mean / unit variance.
# NOTE(review): z-scoring before the train/test split leaks statistics from
# the test rows; ideally fit the scaling on the training split only.
df_new = df_new.apply(zscore)
df_new.head()
# Split the standardized frame into features (x) and target (y).
x = df_new.drop(['Signal_Strength'], axis = 1)
y = df_new[['Signal_Strength']]
# Fit PCA with one component per feature and plot the cumulative
# explained-variance curve to choose how many components to keep.
n_features = x.shape[1]  # derive from the data instead of hard-coding 14
pca_model = PCA(n_components = n_features)
pca_model.fit(x)
plt.step(list(range(1, n_features + 1)), np.cumsum(pca_model.explained_variance_ratio_), where = 'mid')
plt.ylabel('Cumulative Variance Explained')  # fixed label typo ('Cummulation')
plt.xlabel('Eigen Values')
plt.show()
np.cumsum(pca_model.explained_variance_ratio_)
# Elbow analysis: for k = 1..9 fit k-means on the full frame and record the
# mean distance from each row to its nearest cluster centre (distortion).
cluster = range(1, 10)
mean_distortions = []
for k in cluster:
    km = KMeans(n_clusters = k)
    km.fit(df_new)
    nearest_dist = np.min(cdist(df_new, km.cluster_centers_), axis = 1)
    mean_distortions.append(nearest_dist.sum() / df_new.shape[0])
# Plot distortion against k; the "elbow" suggests a good cluster count.
plt.plot(cluster, mean_distortions,'bx-')
plt.xlabel('No. Of Clusters')
plt.ylabel('Distortion')
plt.title('Elbow Method')
X_train1, X_test1, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42) # Splitting Data
# Fit the scaler on the training split only and reuse it for the test split.
# The original fit a second scaler on the test set, which leaks test-set
# statistics and makes the two sets incomparable. (StandardScaler is already
# imported at the top of the file.)
scaler = StandardScaler().fit(X_train1)
X_train_sd = scaler.transform(X_train1)
X_test_sd = scaler.transform(X_test1)
# generating the covariance matrix and the eigen values for the PCA analysis
cov_matrix = np.cov(X_train_sd.T) # the relevant (features x features) covariance matrix
# NOTE(review): the comma passes cov_matrix as a second print argument rather
# than %-formatting it into the string.
print('Covariance Matrix \n%s', cov_matrix)
#generating the eigen values and the eigen vectors
# NOTE(review): np.linalg.eigh is preferable for a symmetric matrix such as a
# covariance matrix (guaranteed real, sorted eigenvalues) — confirm before switching.
e_vals, e_vecs = np.linalg.eig(cov_matrix)
print('Eigenvectors \n%s' %e_vecs)
print('\nEigenvalues \n%s' %e_vals)
# The "cumulative variance explained" analysis: each eigenvalue's percentage
# share of the total variance, largest first.
tot = sum(e_vals)
var_exp = [( i /tot ) * 100 for i in sorted(e_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
# Plotting the variance explained by the principal components and the cumulative variance explained.
plt.figure(figsize=(10 , 5))
plt.bar(range(1, e_vals.size + 1), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, e_vals.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
Observation :
We can notice through pca that 11 parameters explain 95%-97% of the data.
# Pair each eigenvalue's magnitude with its eigenvector.
eigen_pairs = [(np.abs(e_vals[i]), e_vecs[:,i]) for i in range(len(e_vals))]
# Sort on the eigenvalue only. The original tuple sort would fall back to
# comparing the eigenvector arrays on a tie and raise an ambiguous-truth
# ValueError; sorting by key avoids that while giving the same order otherwise.
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
eigen_pairs[:14]
# Generating dimensionally reduced datasets: project onto the top-2 components.
# reshape(-1, 1) adapts to the feature count instead of hard-coding 14.
w = np.hstack((eigen_pairs[0][1].reshape(-1, 1),
               eigen_pairs[1][1].reshape(-1, 1)))
print('Matrix W:\n', w)
X_sd_pca = X_train_sd.dot(w)
X_test_sd_pca = X_test_sd.dot(w)
X_train_sd.shape, w.shape, X_sd_pca.shape, X_test_sd_pca.shape
X_sd_pca, X_train_sd
y_train
# SGD Neural Network regression model before pca (no hidden layers).
reg_model = Sequential()
# Input Layer: 9 ReLU units over the 14 standardized features.
reg_model.add(Dense(9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
# Output Layer: one LINEAR unit. The original used 'softmax', but softmax over
# a single unit is identically 1.0, so the regression model could never learn.
reg_model.add(Dense(1, kernel_initializer = 'normal', activation = 'linear'))
sgd = optimizers.SGD(learning_rate = 0.01)  # 'lr' is deprecated in tf.keras
reg_model.compile(optimizer = sgd, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
reg_model.summary()
history = reg_model.fit(X_train_sd, y_train, epochs = 100, verbose = 1)
reg_model.evaluate(X_train_sd, y_train)

# SGD regression model with two tanh hidden layers.
model = Sequential()
model.add(Dense(9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
model.add(Dense(6, activation='tanh', kernel_initializer = 'normal')) # 2nd layer
model.add(Dense(5, activation='tanh', kernel_initializer = 'normal')) # 3rd layer
# Linear output unit (see the softmax note above).
model.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
sgd1 = optimizers.SGD(learning_rate = 0.01)
model.compile(optimizer = sgd1, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model.summary()
model.fit(X_train_sd, y_train, epochs = 100)
model.evaluate(X_train_sd, y_train)

# SGD regression model with four sigmoid hidden layers.
model1 = Sequential()
model1.add(Dense(9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
model1.add(Dense(10, activation='sigmoid', kernel_initializer = 'normal')) # 2nd layer
model1.add(Dense(12, activation='sigmoid', kernel_initializer = 'normal')) # 3rd layer
model1.add(Dense(8, activation='sigmoid', kernel_initializer = 'normal')) # 4th layer
model1.add(Dense(6, activation='sigmoid', kernel_initializer = 'normal')) # 5th layer
# Linear output unit (see the softmax note above).
model1.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
sgd2 = optimizers.SGD(learning_rate = 0.01)
model1.compile(optimizer = sgd2, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model1.summary()
model1.fit(X_train_sd, y_train, epochs = 100)
model1.evaluate(X_train_sd, y_train)
# Adam Neural Network regression model before pca (no hidden layers).
reg_model1 = Sequential()
# Input Layer: 9 ReLU units over the 14 standardized features.
reg_model1.add(Dense(9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
# Output Layer: one LINEAR unit. The original 'softmax' over a single unit is
# identically 1.0 and cannot fit a regression target.
reg_model1.add(Dense(1, kernel_initializer = 'normal', activation = 'linear'))
adam = optimizers.Adam(learning_rate = 0.01)  # 'lr' is deprecated in tf.keras
reg_model1.compile(optimizer = adam, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
reg_model1.summary()
history1 = reg_model1.fit(X_train_sd, y_train, epochs = 100, verbose = 1)
reg_model1.evaluate(X_train_sd, y_train)

# Adam regression model with two ELU hidden layers.
model2 = Sequential()
model2.add(Dense(9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
model2.add(Dense(10, activation='elu', kernel_initializer = 'normal')) # 2nd layer
model2.add(Dense(4, activation='elu', kernel_initializer = 'normal')) # 3rd layer
# Linear output unit (see the softmax note above).
model2.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
adam1 = optimizers.Adam(learning_rate = 0.01)
model2.compile(optimizer = adam1, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model2.summary()
model2.fit(X_train_sd, y_train, epochs = 100)
model2.evaluate(X_train_sd, y_train)

# Adam regression model with four tanh hidden layers.
model3 = Sequential()
model3.add(Dense(9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
model3.add(Dense(10, activation='tanh', kernel_initializer = 'normal')) # 2nd layer
model3.add(Dense(4, activation='tanh', kernel_initializer = 'normal')) # 3rd layer
model3.add(Dense(5, activation='tanh', kernel_initializer = 'normal')) # 4th layer
model3.add(Dense(12, activation='tanh', kernel_initializer = 'normal')) # 5th layer
# Linear output unit (see the softmax note above).
model3.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
adam2 = optimizers.Adam(learning_rate = 0.01)
model3.compile(optimizer = adam2, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model3.summary()
model3.fit(X_train_sd, y_train, epochs = 100)
model3.evaluate(X_train_sd, y_train)
# RMSprop Neural Network regression model before pca (no hidden layers).
reg_model2 = Sequential()
# Input Layer: 9 ReLU units over the 14 standardized features.
reg_model2.add(Dense(9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
# Output Layer: one LINEAR unit. The original 'softmax' over a single unit is
# identically 1.0 and cannot fit a regression target.
reg_model2.add(Dense(1, kernel_initializer = 'normal', activation = 'linear'))
rms = optimizers.RMSprop(learning_rate = 0.01)  # 'lr' is deprecated in tf.keras
reg_model2.compile(optimizer = rms, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
reg_model2.summary()
history2 = reg_model2.fit(X_train_sd, y_train, epochs = 100, verbose = 1)
reg_model2.evaluate(X_train_sd, y_train)

# RMSprop regression model with two sigmoid hidden layers.
model4 = Sequential()
model4.add(Dense(9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
model4.add(Dense(15, activation='sigmoid', kernel_initializer = 'normal')) # 2nd layer
model4.add(Dense(10, activation='sigmoid', kernel_initializer = 'normal')) # 3rd layer
# Linear output unit (see the softmax note above).
model4.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
rms1 = optimizers.RMSprop(learning_rate = 0.01)
model4.compile(optimizer = rms1, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model4.summary()
model4.fit(X_train_sd, y_train, epochs = 100)
model4.evaluate(X_train_sd, y_train)

# RMSprop regression model with four sigmoid hidden layers.
model5 = Sequential()
model5.add(Dense(9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
model5.add(Dense(15, activation='sigmoid', kernel_initializer = 'normal')) # 2nd layer
model5.add(Dense(10, activation='sigmoid', kernel_initializer = 'normal')) # 3rd layer
model5.add(Dense(10, activation='sigmoid', kernel_initializer = 'normal')) # 4th layer
model5.add(Dense(20, activation='sigmoid', kernel_initializer = 'normal')) # 5th layer
# Linear output unit (see the softmax note above).
model5.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
rms2 = optimizers.RMSprop(learning_rate = 0.01)
model5.compile(optimizer = rms2, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model5.summary()
model5.fit(X_train_sd, y_train, epochs = 100)
model5.evaluate(X_train_sd, y_train)
# SGD Neural Network regression model after pca
reg_model3 = Sequential()
# Input Layer
reg_model3.add(Dense (1, input_dim = 2, kernel_initializer = 'normal', activation = 'relu'))
# Output Layer
reg_model3.add(Dense(1, kernel_initializer = 'normal', activation = 'softmax'))
sgd3 = optimizers.SGD(lr = 0.01)
reg_model3.compile(optimizer = sgd3, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
reg_model3.summary()
history3 = reg_model3.fit(X_sd_pca, y_train, epochs = 100)
reg_model3.evaluate(X_sd_pca, y_train)
# Initialize Sequential model: SGD regression net, two hidden layers, PCA inputs
model6 = Sequential()
# Input Layer
model6.add(Dense (1, input_dim = 2, kernel_initializer = 'normal', activation = 'relu'))
# Adding two Hidden layers
model6.add(Dense(6, activation='tanh', kernel_initializer = 'normal')) # 2nd layer
model6.add(Dense(5, activation='tanh', kernel_initializer = 'normal')) # 3rd layer
# Output layer: linear — 'softmax' on one unit is constant 1.0 (untrainable)
model6.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
sgd4 = optimizers.SGD(lr = 0.01)
model6.compile(optimizer = sgd4, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model6.summary()
model6.fit(X_sd_pca, y_train, epochs = 100)
model6.evaluate(X_sd_pca, y_train)
# Initialize Sequential model: SGD regression net, four hidden layers, PCA inputs
model7 = Sequential()
# Input Layer
model7.add(Dense (1, input_dim = 2, kernel_initializer = 'normal', activation = 'relu'))
# Adding four Hidden layers
model7.add(Dense(10, activation='sigmoid', kernel_initializer = 'normal')) # 2nd layer
model7.add(Dense(12, activation='sigmoid', kernel_initializer = 'normal')) # 3rd layer
model7.add(Dense(8, activation='sigmoid', kernel_initializer = 'normal')) # 4th layer
model7.add(Dense(6, activation='sigmoid', kernel_initializer = 'normal')) # 5th layer
# Output layer: linear — 'softmax' on one unit is constant 1.0 (untrainable)
model7.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
sgd5 = optimizers.SGD(lr = 0.01)
model7.compile(optimizer = sgd5, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model7.summary()
model7.fit(X_sd_pca, y_train, epochs = 100)
model7.evaluate(X_sd_pca, y_train)
# Adam Neural Network regression model after pca
reg_model4 = Sequential()
# Input Layer
reg_model4.add(Dense (1, input_dim = 2, kernel_initializer = 'normal', activation = 'relu'))
# Output Layer — linear activation: a 1-unit softmax always outputs 1.0,
# which would make this regression model untrainable.
reg_model4.add(Dense(1, kernel_initializer = 'normal', activation = 'linear'))
adam3 = optimizers.Adam(lr = 0.01)
reg_model4.compile(optimizer = adam3, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
reg_model4.summary()
history4 = reg_model4.fit(X_sd_pca, y_train, epochs = 100)
reg_model4.evaluate(X_sd_pca, y_train)
# Initialize Sequential model: Adam regression net, two hidden layers, PCA inputs
model8 = Sequential()
# Input Layer
model8.add(Dense (1, input_dim = 2, kernel_initializer = 'normal', activation = 'relu'))
# Adding two Hidden layers
model8.add(Dense(10, activation='elu', kernel_initializer = 'normal')) # 2nd layer
model8.add(Dense(4, activation='elu', kernel_initializer = 'normal')) # 3rd layer
# Output layer: linear — 'softmax' on one unit is constant 1.0 (untrainable)
model8.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
adam4 = optimizers.Adam(lr = 0.01)
model8.compile(optimizer = adam4, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model8.summary()
model8.fit(X_sd_pca, y_train, epochs = 100)
model8.evaluate(X_sd_pca, y_train)
# Initialize Sequential model: Adam regression net, four hidden layers, PCA inputs
model9 = Sequential()
# Input Layer
model9.add(Dense (1, input_dim = 2, kernel_initializer = 'normal', activation = 'relu'))
# Adding four Hidden layers
model9.add(Dense(10, activation='tanh', kernel_initializer = 'normal')) # 2nd layer
model9.add(Dense(4, activation='tanh', kernel_initializer = 'normal')) # 3rd layer
model9.add(Dense(5, activation='tanh', kernel_initializer = 'normal')) # 4th layer
model9.add(Dense(12, activation='tanh', kernel_initializer = 'normal')) # 5th layer
# Output layer: linear — 'softmax' on one unit is constant 1.0 (untrainable)
model9.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
adam5 = optimizers.Adam(lr = 0.01)
model9.compile(optimizer = adam5, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model9.summary()
model9.fit(X_sd_pca, y_train, epochs = 100)
model9.evaluate(X_sd_pca, y_train)
# Rmsprop Neural Network regression model after pca
reg_model5 = Sequential()
# Input Layer
reg_model5.add(Dense (1, input_dim = 2, kernel_initializer = 'normal', activation = 'relu'))
# Output Layer — linear activation: a 1-unit softmax always outputs 1.0,
# which would make this regression model untrainable.
reg_model5.add(Dense(1, kernel_initializer = 'normal', activation = 'linear'))
rms3 = optimizers.RMSprop(lr = 0.01)
reg_model5.compile(optimizer = rms3, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
reg_model5.summary()
reg_model5.fit(X_sd_pca, y_train, epochs = 100)
reg_model5.evaluate(X_sd_pca, y_train)
# Initialize Sequential model: RMSprop regression net, two hidden layers, PCA inputs
model10 = Sequential()
# Input Layer
model10.add(Dense (1, input_dim = 2, kernel_initializer = 'normal', activation = 'relu'))
# Adding two Hidden layers
model10.add(Dense(15, activation='sigmoid', kernel_initializer = 'normal')) # 2nd layer
model10.add(Dense(10, activation='sigmoid', kernel_initializer = 'normal')) # 3rd layer
# Output layer: linear — 'softmax' on one unit is constant 1.0 (untrainable)
model10.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
rms4 = optimizers.RMSprop(lr = 0.01)
model10.compile(optimizer = rms4, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model10.summary()
model10.fit(X_sd_pca, y_train, epochs = 100)
model10.evaluate(X_sd_pca, y_train)
# Initialize Sequential model: RMSprop regression net, four hidden layers, PCA inputs
model11 = Sequential()
# Input Layer
model11.add(Dense (1, input_dim = 2, kernel_initializer = 'normal', activation = 'relu'))
# Adding four Hidden layers
model11.add(Dense(15, activation='sigmoid', kernel_initializer = 'normal')) # 2nd layer
model11.add(Dense(10, activation='sigmoid', kernel_initializer = 'normal')) # 3rd layer
model11.add(Dense(10, activation='sigmoid', kernel_initializer = 'normal')) # 4th layer
model11.add(Dense(20, activation='sigmoid', kernel_initializer = 'normal')) # 5th layer
# Output layer: linear — 'softmax' on one unit is constant 1.0 (untrainable)
model11.add(Dense(1, activation='linear', kernel_initializer = 'normal'))
rms5 = optimizers.RMSprop(lr = 0.01)
model11.compile(optimizer = rms5, loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
model11.summary()
model11.fit(X_sd_pca, y_train, epochs = 100)
model11.evaluate(X_sd_pca, y_train)
# Load the signal dataset for the classification part of the project
df = pd.read_csv('Part- 1,2&3 - Signal.csv')
df.head()   # first five rows
df.tail()   # last five rows
df.shape    # (rows, columns)
df.size     # total number of cells
df.isnull().sum()  # null count per column
df.dtypes   # column data types
1. The dataset consists of 1599 entries and 12 columns.
2. Checking the dataset for lapses shows that it has no null values and does not need any major cleaning.
df.info()  # column dtypes plus non-null counts in one view
# Horizontal boxplots of all numeric columns to eyeball outliers at a glance
plt.figure(figsize=(14,9))
sns.boxplot(data = df, orient = 'h', palette = 'Set1', dodge = False);
Observation:
From the above boxplot we can see that outliers are present in almost all columns. I will identify the outliers column by column below.
# Plotting a visual analysis of parameter 1: distribution plot + boxplot side by side
fig,(ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Parameter 1'], ax = ax1, color = 'red')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 1', fontsize = 15)
sns.boxplot(df['Parameter 1'], ax = ax2, color = 'red')
ax2.set_title('Box Plot', fontsize = 15)
ax2.set_xlabel('Parameter 1', fontsize = 15)
# Checking outliers in parameter 1 using the 1.5*IQR (Tukey fence) rule
outlier_cols0 = []
Q1 = df['Parameter 1'].quantile(0.25) # 1st Quartile
Q3 = df['Parameter 1'].quantile(0.75) # 3rd Quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para1 = Q1 - 1.5 * IQR # Lower range bound
UTV_para1 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range =', IQR)
print('Parameter 1 <', LTV_para1, 'and >', UTV_para1, 'are outliers')
# Count observations lying beyond each fence
print('Number of outliers in parameter 1 column below the lower whisker =', df[df['Parameter 1'] < (Q1 - (1.5*IQR))]['Parameter 1'].count())
print('Number of outliers in parameter 1 column above the upper whisker =', df[df['Parameter 1'] > (Q3 + (1.5*IQR))]['Parameter 1'].count())
outlier_cols0.append('Parameter 1')
# Seed the dict mapping each column name to its upper whisker bound
upperLowerBound_Disct = {'Parameter 1' : UTV_para1}
Observation :
We can observe from the outlier analysis above that we have a total of 49 outliers in "Parameter 1" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 2: distribution plot + boxplot
fig,(ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Parameter 2'], ax = ax1, color = 'b')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 2', fontsize = 15)
sns.boxplot(df['Parameter 2'], ax = ax2, color = 'b')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 2', fontsize = 15)
# Checking outliers in parameter 2 using the 1.5*IQR rule
outlier_cols1 = []
Q1 = df['Parameter 2'].quantile(0.25) # 1st Quartile
Q3 = df['Parameter 2'].quantile(0.75) # 3rd Quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para2 = Q1 - 1.5 * IQR # Lower range bound
UTV_para2 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range =', IQR)
print('Parameter 2 <', LTV_para2, 'and >', UTV_para2, 'are outliers')
# Reuse the precomputed fences instead of re-deriving them inline
print('Number of outliers in the parameter 2 column below the lower whisker =', df[df['Parameter 2'] < LTV_para2]['Parameter 2'].count())
print('Number of outliers in the parameter 2 column above the upper whisker =', df[df['Parameter 2'] > UTV_para2]['Parameter 2'].count())
outlier_cols1.append('Parameter 2')
# BUG FIX: store under a new key — rebinding the whole dict discarded the
# bounds recorded for earlier parameters.
upperLowerBound_Disct['Parameter 2'] = UTV_para2
Observation :
We can observe from the outlier analysis above that we have a total of 19 outliers in "Parameter 2" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 3: distribution plot + boxplot
fig,(ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Parameter 3'], ax = ax1, color = 'green')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 3', fontsize = 15)
sns.boxplot(df['Parameter 3'], ax = ax2, color = 'green')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 3', fontsize = 15)
# Checking outliers in parameter 3 using the 1.5*IQR rule
outlier_cols2 = []
Q1 = df['Parameter 3'].quantile(0.25) # 1st Quartile
Q3 = df['Parameter 3'].quantile(0.75) # 3rd Quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para3 = Q1 - 1.5 * IQR # Lower range bound
UTV_para3 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range = ', IQR)
print('Parameter 3 <', LTV_para3, 'and >', UTV_para3, 'are outliers')
print('Number of outliers in the parameter 3 column below the lower whisker =', df[df['Parameter 3'] < LTV_para3]['Parameter 3'].count())
print('Number of outliers in the parameter 3 column above the upper whisker =', df[df['Parameter 3'] > UTV_para3]['Parameter 3'].count())
outlier_cols2.append('Parameter 3')
# BUG FIX: add a key rather than rebinding the dict, which would drop the
# bounds stored for earlier parameters.
upperLowerBound_Disct['Parameter 3'] = UTV_para3
Observation :
We can observe from the outlier analysis above that we have a total of 1 outlier in "Parameter 3" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 4: distribution plot + boxplot
fig,(ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Parameter 4'], ax = ax1, color = 'purple')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 4', fontsize = 15)
sns.boxplot(df['Parameter 4'], ax = ax2, color = 'purple')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 4', fontsize = 15)
# Checking outliers in parameter 4 using the 1.5*IQR rule
outlier_cols3 = []
Q1 = df['Parameter 4'].quantile(0.25) # 1st Quartile
Q3 = df['Parameter 4'].quantile(0.75) # 3rd Quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para4 = Q1 - 1.5 * IQR # Lower range bound
UTV_para4 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range = ', IQR)
print('Parameter 4 <', LTV_para4, 'and >', UTV_para4, 'are outliers')
print('Number of outliers in the parameter 4 column below the lower whisker =', df[df['Parameter 4'] < LTV_para4]['Parameter 4'].count())
print('Number of outliers in the parameter 4 column above the upper whisker =', df[df['Parameter 4'] > UTV_para4]['Parameter 4'].count())
outlier_cols3.append('Parameter 4')
# BUG FIX: add a key rather than rebinding the dict, which would drop the
# bounds stored for earlier parameters.
upperLowerBound_Disct['Parameter 4'] = UTV_para4
Observation :
We can observe from the outlier analysis above that we have a total of 155 outliers in "Parameter 4" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 5: distribution plot + boxplot
fig,(ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Parameter 5'], ax = ax1, color = 'orange')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 5', fontsize = 15)
sns.boxplot(df['Parameter 5'], ax = ax2, color = 'orange')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 5', fontsize = 15)
# Checking outliers in parameter 5 using the 1.5*IQR rule
outlier_cols4 = []
Q1 = df['Parameter 5'].quantile(0.25) # 1st Quartile
Q3 = df['Parameter 5'].quantile(0.75) # 3rd Quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para5 = Q1 - 1.5 * IQR # Lower range bound
UTV_para5 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range = ', IQR)
print('Parameter 5 <', LTV_para5, 'and >', UTV_para5, 'are outliers')
print('Number of outliers in the parameter 5 column below the lower whisker =', df[df['Parameter 5'] < LTV_para5]['Parameter 5'].count())
print('Number of outliers in the parameter 5 column above the upper whisker =', df[df['Parameter 5'] > UTV_para5]['Parameter 5'].count())
outlier_cols4.append('Parameter 5')
# BUG FIX: add a key rather than rebinding the dict, which would drop the
# bounds stored for earlier parameters.
upperLowerBound_Disct['Parameter 5'] = UTV_para5
Observation :
We can observe from the outlier analysis above that we have a total of 112 outlier in "Parameter 5", 9 towards the lower whisker and 103 towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 6: distribution plot + boxplot
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Parameter 6'], ax = ax1, color = 'black')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 6', fontsize = 15)
sns.boxplot(df['Parameter 6'], ax = ax2, color = 'black')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 6', fontsize = 15)
# Checking outliers in parameter 6 using the 1.5*IQR rule
outlier_cols5 = []
Q1 = df['Parameter 6'].quantile(0.25) # 1st quartile
Q3 = df['Parameter 6'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 #Interquartile range
LTV_para6 = Q1 - 1.5 * IQR # Lower range bound
UTV_para6 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range =', IQR)
print('Parameter 6 <', LTV_para6, ' and >', UTV_para6, 'are outliers')
print('Number of outliers in the parameter 6 column below the lower whisker =', df[df['Parameter 6'] < LTV_para6]['Parameter 6'].count())
print('Number of outliers in the parameter 6 column above the upper whisker =', df[df['Parameter 6'] > UTV_para6]['Parameter 6'].count())
outlier_cols5.append('Parameter 6')
# BUG FIX: the original assigned to a misspelled name (upperLowerBand_Disct),
# creating a second dict; record the bound in the shared dict under its key.
upperLowerBound_Disct['Parameter 6'] = UTV_para6
Observation :
We can observe from the outlier analysis above that we have a total of 30 outliers in "Parameter 6" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 7: distribution plot + boxplot
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Parameter 7'], ax = ax1, color = 'brown')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 7', fontsize = 15)
sns.boxplot(df['Parameter 7'], ax = ax2, color = 'brown')
ax2.set_title('Boxplot', fontsize= 15)
ax2.set_xlabel('Parameter 7', fontsize = 15)
# Checking outliers in parameter 7 using the 1.5*IQR rule
outlier_cols6 = []
Q1 = df['Parameter 7'].quantile(0.25) # 1st quartile
Q3 = df['Parameter 7'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para7 = Q1 - 1.5 * IQR # Lower range bound
UTV_para7 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range =', IQR)
print('Parameter 7 <', LTV_para7, 'and >', UTV_para7, 'are outliers')
print('Number of outliers in the parameter 7 column below the lower whisker =', df[df['Parameter 7'] < LTV_para7]['Parameter 7'].count())
print('Number of outliers in the parameter 7 column above the upper whisker =', df[df['Parameter 7'] > UTV_para7]['Parameter 7'].count())
outlier_cols6.append('Parameter 7')
# BUG FIX: add a key rather than rebinding the dict, which would drop the
# bounds stored for earlier parameters.
upperLowerBound_Disct['Parameter 7'] = UTV_para7
Observation :
We can observe from the outlier analysis above that we have a total of 55 outliers in "Parameter 7", all towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 8: distribution plot + boxplot
fig,(ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Parameter 8'], ax = ax1, color = 'pink')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 8', fontsize = 15)
sns.boxplot(df['Parameter 8'], ax = ax2, color = 'pink')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 8', fontsize = 15)
# Checking outliers in parameter 8 using the 1.5*IQR rule
outlier_cols7 = []
Q1 = df['Parameter 8'].quantile(0.25) # 1st quartile
Q3 = df['Parameter 8'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para8 = Q1 - 1.5 * IQR # Lower range bound
UTV_para8 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range = ', IQR)
print('Parameter 8 <', LTV_para8, 'and >', UTV_para8, 'are outliers')
print('Number of outliers in the parameter 8 column below the lower whisker =', df[df['Parameter 8'] < LTV_para8]['Parameter 8'].count())
print('Number of outliers in the parameter 8 column above the upper whisker =', df[df['Parameter 8'] > UTV_para8]['Parameter 8'].count())
outlier_cols7.append('Parameter 8')
# BUG FIX: add a key rather than rebinding the dict, which would drop the
# bounds stored for earlier parameters.
upperLowerBound_Disct['Parameter 8'] = UTV_para8
Observation :
We can observe from the outlier analysis above that we have a total of 45 outlier in "Parameter 8", 21 towards the lower whisker and 24 towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 9: distribution plot + boxplot
fig,(ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Parameter 9'], ax = ax1, color = 'grey')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 9', fontsize = 15)
sns.boxplot(df['Parameter 9'], ax = ax2, color = 'grey')
# BUG FIX: these two calls used ax1, overwriting the distribution plot's
# title/label and leaving the boxplot unlabelled.
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 9', fontsize = 15)
# Checking outliers in parameter 9 using the 1.5*IQR rule
outlier_cols8 = []
Q1 = df['Parameter 9'].quantile(0.25) # 1st quantile
Q3 = df['Parameter 9'].quantile(0.75) # 3rd quantile
IQR = Q3 - Q1 # Interquartile range
LTV_para9 = Q1 - 1.5 * IQR # Lower range bound
UTV_para9 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range =', IQR)
print('Parameter 9 <', LTV_para9, 'and >', UTV_para9, 'are outliers')
print('Number of outliers in the parameter 9 column below the lower whisker =', df[df['Parameter 9'] < LTV_para9]['Parameter 9'].count())
print('Number of outliers in the parameter 9 column above the upper whisker =', df[df['Parameter 9'] > UTV_para9]['Parameter 9'].count())
outlier_cols8.append('Parameter 9')
# BUG FIX: add a key rather than rebinding the dict, which would drop the
# bounds stored for earlier parameters.
upperLowerBound_Disct['Parameter 9'] = UTV_para9
Observation :
We can observe from the outlier analysis above that we have a total of 35 outlier in "Parameter 9", 14 towards the lower whisker and 21 towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 10: distribution plot + boxplot
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Parameter 10'], ax = ax1, color = 'gold')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 10', fontsize = 15)
sns.boxplot(df['Parameter 10'], ax = ax2, color = 'gold')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 10', fontsize = 15)
# Checking outliers in parameter 10 using the 1.5*IQR rule
outlier_cols9 = []
Q1 = df['Parameter 10'].quantile(0.25) # 1st quartile
Q3 = df['Parameter 10'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para10 = Q1 - 1.5 * IQR # Lower range bound
UTV_para10 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range =', IQR)
print('Parameter 10 <', LTV_para10, 'and >', UTV_para10, 'are outliers')
print('Number of outliers in the parameter 10 column below the lower whisker =', df[df['Parameter 10'] < LTV_para10]['Parameter 10'].count())
# BUG FIX: message said "below the upper whisker" for the above-upper count
print('Number of outliers in the parameter 10 column above the upper whisker =', df[df['Parameter 10'] > UTV_para10]['Parameter 10'].count())
outlier_cols9.append('Parameter 10')
# BUG FIX: add a key rather than rebinding the dict, which would drop the
# bounds stored for earlier parameters.
upperLowerBound_Disct['Parameter 10'] = UTV_para10
Observation :
We can observe from the outlier analysis above that we have a total of 59 outliers in "Parameter 10" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of parameter 11: distribution plot + boxplot
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Parameter 11'], ax = ax1, color = 'white')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Parameter 11', fontsize = 15)
sns.boxplot(df['Parameter 11'], ax = ax2, color = 'white')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Parameter 11', fontsize = 15)
# Checking outliers in parameter 11 using the 1.5*IQR rule
outlier_cols10 = []
Q1 = df['Parameter 11'].quantile(0.25) # 1st quartile
Q3 = df['Parameter 11'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para11 = Q1 - 1.5 * IQR # Lower range bound
UTV_para11 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range =', IQR)
print('Parameter 11 <', LTV_para11, 'and >', UTV_para11, 'are outliers')
print('Number of outliers in the parameter 11 column below the lower whisker =', df[df['Parameter 11'] < LTV_para11]['Parameter 11'].count())
# BUG FIX: message said "below the upper whisker" for the above-upper count
print('Number of outliers in the parameter 11 column above the upper whisker =', df[df['Parameter 11'] > UTV_para11]['Parameter 11'].count())
outlier_cols10.append('Parameter 11')
# BUG FIX: add a key rather than rebinding the dict, which would drop the
# bounds stored for earlier parameters.
upperLowerBound_Disct['Parameter 11'] = UTV_para11
Observation :
We can observe from the outlier analysis above that we have a total of 13 outliers in "Parameter 11" which is towards the upper whisker. We will treat them later on.
# Plotting a visual analysis of signal_strength: distribution plot + boxplot
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (13,7))
fig.set_size_inches(20,7)
sns.distplot(df['Signal_Strength'], ax = ax1, color = 'yellow')
ax1.tick_params(labelsize = 15)
ax1.set_title('Distribution Plot', fontsize = 15)
ax1.set_xlabel('Signal_Strength', fontsize = 15)
sns.boxplot(df['Signal_Strength'], ax = ax2, color = 'yellow')
ax2.set_title('Boxplot', fontsize = 15)
ax2.set_xlabel('Signal_Strength', fontsize = 15)
# Checking outliers in signal_strength using the 1.5*IQR rule
outlier_cols11 = []
Q1 = df['Signal_Strength'].quantile(0.25) # 1st quartile
Q3 = df['Signal_Strength'].quantile(0.75) # 3rd quartile
IQR = Q3 - Q1 # Interquartile range
LTV_para12 = Q1 - 1.5 * IQR # Lower range bound
UTV_para12 = Q3 + 1.5 * IQR # Upper range bound
print('Interquartile range =', IQR)
print('Signal_Strength <', LTV_para12, 'and >', UTV_para12, 'are outliers')
print('Number of outliers in the Signal_Strength column below the lower whisker =', df[df['Signal_Strength'] < LTV_para12]['Signal_Strength'].count())
# BUG FIX: message said "below the upper whisker" for the above-upper count
print('Number of outliers in the Signal_Strength column above the upper whisker =', df[df['Signal_Strength'] > UTV_para12]['Signal_Strength'].count())
outlier_cols11.append('Signal_Strength')
# BUG FIX: add a key rather than rebinding the dict, which would drop the
# bounds stored for earlier parameters.
upperLowerBound_Disct['Signal_Strength'] = UTV_para12
Observation :
We can observe from the outlier analysis above that we have a total of 28 outlier in "Signal_Strength", 10 towards the lower whisker and 18 towards the upper whisker. We will treat them later on.
# Pairplot visual analysis to check correlation amongst different fields
# (KDE curves on the diagonal instead of histograms)
sns.pairplot(df, diag_kind = 'kde');
df.corr()  # pairwise Pearson correlation matrix
# Heatmap of the same correlation matrix with the values annotated
plt.figure(figsize = (18,12))
sns.heatmap(df.corr(), annot = True, fmt = 'g');
Observation : From the above pair plot & heatmap we can infer the relationship amongst the attributes and target column as follows:
------------------------ Fixing Outliers ------------------------
# Showing the columns where outliers exist
print('These are the columns which have outliers : \n\n', outlier_cols0, outlier_cols1, outlier_cols2, outlier_cols3, outlier_cols4, outlier_cols5, outlier_cols6, outlier_cols7, outlier_cols8, outlier_cols9, outlier_cols10, outlier_cols11)
df_new = df.copy()
# Treating outliers present in respective columns: any value outside the
# 1.5*IQR whiskers is replaced by the column median.
# NOTE(review): columns[:11] covers only the first 11 columns, so
# Signal_Strength (the target) is left untreated — presumably intentional;
# confirm.
for col_names in df_new.columns[:11]:
    q1 = df_new[col_names].quantile(0.25)
    q3 = df_new[col_names].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5*iqr
    high = q3 + 1.5*iqr
    df_new.loc[(df_new[col_names] < low) | (df_new[col_names] > high), col_names] = df_new[col_names].median()
# Re-plot the boxplots to confirm the treatment
plt.figure(figsize=(15,8))
sns.boxplot(data = df_new, orient='h', palette='Set1', dodge=False);
Observation :
The boxplots above show that the outliers have been replaced with their column medians. Because replacing values with the median reshapes each distribution, a few new mild outliers appear in some columns; these can safely be ignored.
# Treat the target as a discrete class label rather than a number
df_new[['Signal_Strength']] = df_new[['Signal_Strength']].astype('category')
df_new.shape
df_new.size
df_new.head()
df_new.count(axis = 0)  # non-null count per column
df_new.dtypes
# Engineered ratio feature combining parameters 2, 3, 9 and 10; inserted just
# before the last column so Signal_Strength stays last
df_new.insert(df_new.shape[-1]-1,'Parameter 2,3,9 & 10',df_new['Parameter 9']/(df_new['Parameter 2'] + df_new['Parameter 3'] + df_new['Parameter 10']))
df_new.head()
# Ratio of parameter 11 to parameter 1
df_new.insert(df_new.shape[-1]-1, 'Parameter 1 & 11', df_new['Parameter 11']/df_new['Parameter 1'])
df_new.head()
# Ratio of parameter 4 to the sum of parameters 8 and 5
df_new.insert(df_new.shape[-1]-1,'Parameter 4,5 & 8',df_new['Parameter 4']/(df_new['Parameter 8'] + df_new['Parameter 5']))
df_new.head()
df_new.shape
df_new.size
df_new.isnull().sum()  # verify the new ratio columns introduced no NaNs
### Splitting X-independent attributes and Y-dependent attributes and keeping the test set seperate
x = df_new.drop(['Signal_Strength'], axis = 1)
y = df_new[['Signal_Strength']]
# BUG FIX: DataFrame.apply returns a new frame — the original discarded the
# result, so PCA ran on unscaled data. Assign the z-scored frame back.
x = x.apply(zscore)
# Fit PCA on all 14 features and plot the cumulative explained variance
pca_model = PCA(n_components = 14)
pca_model.fit(x)
plt.step(list(range(1,15)), np.cumsum(pca_model.explained_variance_ratio_), where = 'mid')
plt.ylabel('Cummulation of Variance Explained')
plt.xlabel('Eigen Values')
plt.show()
np.cumsum(pca_model.explained_variance_ratio_)
# Elbow method: fit KMeans for k = 1..11 and record the mean distance of each
# point to its nearest centroid (distortion)
cluster = range(1,12)
mean_distortions = []
for val in cluster:
    kmeans = KMeans(n_clusters = val)
    # NOTE(review): fit on df_new, which includes the categorical
    # Signal_Strength column and unscaled engineered features — confirm
    # this is the intended clustering input.
    kmeans.fit(df_new)
    mean_distortions.append(sum(np.min(cdist(df_new, kmeans.cluster_centers_), axis = 1))/df_new.shape[0])
plt.plot(cluster, mean_distortions,'bx-')
plt.xlabel('No. Of Clusters')
plt.ylabel('Distortion')
plt.title('Elbow Method')
# Train/test split, then standardize features and one-hot encode the labels
X_train1, X_test1, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
from sklearn.preprocessing import StandardScaler
# BUG FIX: fit the scaler on the training data only and reuse it for the test
# set — fitting a second scaler on the test split leaks test statistics and
# puts the two splits on different scales.
scaler = StandardScaler().fit(X_train1)
X_train_sd = scaler.transform(X_train1)
X_test_sd = scaler.transform(X_test1)
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
# BUG FIX: fit one encoder on the training labels and apply it to both splits,
# so the same class always maps to the same integer code (two independently
# fitted encoders can disagree when a class is missing from one split).
label_encoder = preprocessing.LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)
# One-hot encode into 10 classes for the softmax classifiers below
y_train = to_categorical(y_train, num_classes=10)
y_test = to_categorical(y_test, num_classes=10)
print("Shape of y_train:", y_train.shape)
print("One hot encoded value of y_train:", y_train[0])
# generating the covariance matrix and the eigen values for the PCA analysis
# (manual PCA: covariance of the standardized training features)
cov_matrix = np.cov(X_train_sd.T) # the relevanat covariance matrix
print('Covariance Matrix \n%s', cov_matrix)
#generating the eigen values and the eigen vectors
e_vals, e_vecs = np.linalg.eig(cov_matrix)
print('Eigenvectors \n%s' %e_vecs)
print('\nEigenvalues \n%s' %e_vals)
# the "cumulative variance explained" analysis
tot = sum(e_vals)
# percentage of total variance carried by each component, largest first
var_exp = [( i /tot ) * 100 for i in sorted(e_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
# Plotting the variance expalained by the principal components and the cumulative variance explained.
plt.figure(figsize=(10 , 5))
plt.bar(range(1, e_vals.size + 1), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, e_vals.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
# Pair each |eigenvalue| with its eigenvector and sort descending.
# NOTE(review): sorting tuples falls back to comparing the vector arrays if
# two eigenvalues tie exactly — rare, but would raise; confirm acceptable.
eigen_pairs = [(np.abs(e_vals[i]), e_vecs[:,i]) for i in range(len(e_vals))]
eigen_pairs.sort(reverse=True)
eigen_pairs[:14]
# generating dimensionally reduced datasets: projection matrix W from the
# top-2 eigenvectors (14 features -> 2 components)
w = np.hstack((eigen_pairs[0][1].reshape(14,1),
eigen_pairs[1][1].reshape(14,1)))
print('Matrix W:\n', w)
# Project train and test features onto the 2 principal components
X_sd_pca = X_train_sd.dot(w)
X_test_sd_pca = X_test_sd.dot(w)
X_train_sd.shape, w.shape, X_sd_pca.shape, X_test_sd_pca.shape
X_sd_pca, X_train_sd
print(X_train_sd.shape)
print(y_train.shape)
print(X_sd_pca.shape)
print(y_test.shape)
# SGD Neural Network classification model before pca (original comment said
# "regression", but softmax output + categorical_crossentropy = classification)
class_model = Sequential()
# Input Layer: 14 standardized features in
class_model.add(Dense (9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
# Output Layer: one softmax unit per class (10 classes)
class_model.add(Dense(10, kernel_initializer = 'normal', activation = 'softmax'))
sgd6 = optimizers.SGD(lr = 0.01)
class_model.compile(optimizer = sgd6, loss = 'categorical_crossentropy', metrics = ['accuracy'])
class_model.summary()
his = class_model.fit(X_train_sd, y_train, epochs = 100, verbose = 1)
class_model.evaluate(X_train_sd, y_train)  # NOTE: evaluated on the training set
# Initialize Sequential model: SGD classifier with two hidden layers
model12 = Sequential()
# Input Layer
model12.add(Dense (9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
# Adding two Hidden layers
model12.add(Dense(6, activation='tanh', kernel_initializer = 'normal')) # 2nd layer
model12.add(Dense(5, activation='tanh', kernel_initializer = 'normal')) # 3rd layer
#Output layer: softmax over 10 classes
model12.add(Dense(10, activation='softmax', kernel_initializer = 'normal'))
sgd7 = optimizers.SGD(lr = 0.01)
model12.compile(optimizer = sgd7, loss = 'categorical_crossentropy', metrics = ['accuracy'])
model12.summary()
model12.fit(X_train_sd, y_train, epochs = 100)
model12.evaluate(X_train_sd, y_train)  # NOTE: evaluated on the training set
# Initialize Sequential model: SGD classifier with four hidden layers
model13 = Sequential()
# Input Layer
model13.add(Dense (9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
# Adding four Hidden layers
model13.add(Dense(10, activation='sigmoid', kernel_initializer = 'normal')) # 2nd layer
model13.add(Dense(20, activation='sigmoid', kernel_initializer = 'normal')) # 3rd layer
# Hidden layers
model13.add(Dense(30, activation='sigmoid', kernel_initializer = 'normal')) # 4th layer
model13.add(Dense(15, activation='sigmoid', kernel_initializer = 'normal')) # 5th layer
#Output layer: softmax over 10 classes
model13.add(Dense(10, activation='softmax', kernel_initializer = 'normal'))
sgd8 = optimizers.SGD(lr = 0.01)
model13.compile(optimizer = sgd8, loss = 'categorical_crossentropy', metrics = ['accuracy'])
model13.summary()
model13.fit(X_train_sd, y_train, epochs = 100)
model13.evaluate(X_train_sd, y_train)  # NOTE: evaluated on the training set
# Adam Neural Network classification model before pca (original comment said
# "regression", but softmax output + categorical_crossentropy = classification)
class_model1 = Sequential()
# Input Layer
class_model1.add(Dense (9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
# Output Layer: softmax over 10 classes
class_model1.add(Dense(10, kernel_initializer = 'normal', activation = 'softmax'))
adam6 = optimizers.Adam(lr = 0.01)
class_model1.compile(optimizer = adam6, loss = 'categorical_crossentropy', metrics = ['accuracy'])
class_model1.summary()
his1 = class_model1.fit(X_train_sd, y_train, epochs = 100, verbose = 1)
class_model1.evaluate(X_train_sd, y_train)  # NOTE: evaluated on the training set
# Initialize Sequential model: Adam classifier with two hidden layers
model14 = Sequential()
# Input Layer
model14.add(Dense (9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
# Adding two Hidden layers
model14.add(Dense(20, activation ='elu', kernel_initializer = 'normal')) # 2nd layer
model14.add(Dense(40, activation ='elu', kernel_initializer = 'normal')) # 3rd layer
#Output layer: softmax over 10 classes
model14.add(Dense(10, activation='softmax', kernel_initializer = 'normal'))
adam7 = optimizers.Adam(lr = 0.01)
model14.compile(optimizer = adam7, loss = 'categorical_crossentropy', metrics = ['accuracy'])
model14.summary()
model14.fit(X_train_sd, y_train, epochs = 100)
model14.evaluate(X_train_sd, y_train)  # NOTE: evaluated on the training set
# Initialize Sequential model: Adam classifier with four hidden layers
model15 = Sequential()
# Input Layer
model15.add(Dense (9, input_dim = 14, kernel_initializer = 'normal', activation = 'relu'))
# Adding four Hidden layers
model15.add(Dense(25, activation='tanh', kernel_initializer = 'normal')) # 2nd layer
model15.add(Dense(10, activation='tanh', kernel_initializer = 'normal')) # 3rd layer
model15.add(Dense(15, activation='tanh', kernel_initializer = 'normal')) # 4th layer
model15.add(Dense(25, activation='tanh', kernel_initializer = 'normal')) # 5th layer
#Output layer: softmax over 10 classes
model15.add(Dense(10, activation='softmax', kernel_initializer = 'normal'))
adam8 = optimizers.Adam(lr = 0.01)
model15.compile(optimizer = adam8, loss = 'categorical_crossentropy', metrics = ['accuracy'])
model15.summary()
model15.fit(X_train_sd, y_train, epochs = 100)
model15.evaluate(X_train_sd, y_train)  # NOTE: evaluated on the training set
# RMSprop neural-network classification model before PCA
# (original comment said "regression", but softmax + categorical_crossentropy
#  is a classifier)
class_model2 = Sequential()
# Input layer: 9 units over 14 input features
class_model2.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Output layer: 10-unit softmax
class_model2.add(Dense(10, kernel_initializer='normal', activation='softmax'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
rms6 = optimizers.RMSprop(learning_rate=0.01)
class_model2.compile(optimizer=rms6, loss='categorical_crossentropy', metrics=['accuracy'])
class_model2.summary()
his2 = class_model2.fit(X_train_sd, y_train, epochs=100, verbose=1)
class_model2.evaluate(X_train_sd, y_train)
# RMSprop classifier with two sigmoid hidden layers (before PCA)
model16 = Sequential()
# Input layer: 9 units over 14 input features
model16.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Two hidden layers
model16.add(Dense(14, activation='sigmoid', kernel_initializer='normal'))  # 2nd layer
model16.add(Dense(28, activation='sigmoid', kernel_initializer='normal'))  # 3rd layer
# Output layer: 10-unit softmax
model16.add(Dense(10, activation='softmax', kernel_initializer='normal'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
rms7 = optimizers.RMSprop(learning_rate=0.01)
model16.compile(optimizer=rms7, loss='categorical_crossentropy', metrics=['accuracy'])
model16.summary()
model16.fit(X_train_sd, y_train, epochs=100)
model16.evaluate(X_train_sd, y_train)
# RMSprop classifier with four sigmoid hidden layers (before PCA)
model17 = Sequential()
# Input layer: 9 units over 14 input features
model17.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Four hidden layers
model17.add(Dense(50, activation='sigmoid', kernel_initializer='normal'))  # 2nd layer
model17.add(Dense(15, activation='sigmoid', kernel_initializer='normal'))  # 3rd layer
model17.add(Dense(30, activation='sigmoid', kernel_initializer='normal'))  # 4th layer
model17.add(Dense(60, activation='sigmoid', kernel_initializer='normal'))  # 5th layer
# Output layer: 10-unit softmax
model17.add(Dense(10, activation='softmax', kernel_initializer='normal'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
rms8 = optimizers.RMSprop(learning_rate=0.01)
model17.compile(optimizer=rms8, loss='categorical_crossentropy', metrics=['accuracy'])
model17.summary()
model17.fit(X_train_sd, y_train, epochs=100)
model17.evaluate(X_train_sd, y_train)
# SGD neural-network classification model after PCA
# (original comment said "regression", but softmax + categorical_crossentropy
#  is a classifier)
# NOTE(review): labelled "after pca" yet still fits X_train_sd, the same
# matrix as the before-PCA models -- confirm the PCA-transformed data was intended.
class_model3 = Sequential()
# Input layer: 9 units over 14 input features
class_model3.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Output layer: 10-unit softmax
class_model3.add(Dense(10, kernel_initializer='normal', activation='softmax'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
sgd9 = optimizers.SGD(learning_rate=0.01)
class_model3.compile(optimizer=sgd9, loss='categorical_crossentropy', metrics=['accuracy'])
class_model3.summary()
his3 = class_model3.fit(X_train_sd, y_train, epochs=100, verbose=1)
class_model3.evaluate(X_train_sd, y_train)
# SGD classifier with two tanh hidden layers (after PCA)
model18 = Sequential()
# Input layer: 9 units over 14 input features
model18.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Two hidden layers
model18.add(Dense(6, activation='tanh', kernel_initializer='normal'))  # 2nd layer
model18.add(Dense(5, activation='tanh', kernel_initializer='normal'))  # 3rd layer
# Output layer: 10-unit softmax
model18.add(Dense(10, activation='softmax', kernel_initializer='normal'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
sgd10 = optimizers.SGD(learning_rate=0.01)
model18.compile(optimizer=sgd10, loss='categorical_crossentropy', metrics=['accuracy'])
model18.summary()
model18.fit(X_train_sd, y_train, epochs=100)
model18.evaluate(X_train_sd, y_train)
# SGD classifier with four sigmoid hidden layers (after PCA)
model19 = Sequential()
# Input layer: 9 units over 14 input features
model19.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Four hidden layers
model19.add(Dense(10, activation='sigmoid', kernel_initializer='normal'))  # 2nd layer
model19.add(Dense(20, activation='sigmoid', kernel_initializer='normal'))  # 3rd layer
model19.add(Dense(30, activation='sigmoid', kernel_initializer='normal'))  # 4th layer
model19.add(Dense(15, activation='sigmoid', kernel_initializer='normal'))  # 5th layer
# Output layer: 10-unit softmax
model19.add(Dense(10, activation='softmax', kernel_initializer='normal'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
sgd11 = optimizers.SGD(learning_rate=0.01)
model19.compile(optimizer=sgd11, loss='categorical_crossentropy', metrics=['accuracy'])
model19.summary()
model19.fit(X_train_sd, y_train, epochs=100)
model19.evaluate(X_train_sd, y_train)
# Adam neural-network classification model after PCA
# (original comment said "regression", but softmax + categorical_crossentropy
#  is a classifier)
# NOTE(review): labelled "after pca" yet still fits X_train_sd -- confirm
# the PCA-transformed data was intended.
class_model4 = Sequential()
# Input layer: 9 units over 14 input features
class_model4.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Output layer: 10-unit softmax
class_model4.add(Dense(10, kernel_initializer='normal', activation='softmax'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
adam9 = optimizers.Adam(learning_rate=0.01)
class_model4.compile(optimizer=adam9, loss='categorical_crossentropy', metrics=['accuracy'])
class_model4.summary()
his4 = class_model4.fit(X_train_sd, y_train, epochs=100, verbose=1)
class_model4.evaluate(X_train_sd, y_train)
# Adam classifier with two ELU hidden layers (after PCA)
model20 = Sequential()
# Input layer: 9 units over 14 input features
model20.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Two hidden layers
model20.add(Dense(16, activation='elu', kernel_initializer='normal'))  # 2nd layer
model20.add(Dense(8, activation='elu', kernel_initializer='normal'))   # 3rd layer
# Output layer: 10-unit softmax
model20.add(Dense(10, activation='softmax', kernel_initializer='normal'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
adam10 = optimizers.Adam(learning_rate=0.01)
model20.compile(optimizer=adam10, loss='categorical_crossentropy', metrics=['accuracy'])
model20.summary()
model20.fit(X_train_sd, y_train, epochs=100)
model20.evaluate(X_train_sd, y_train)
# Adam classifier with four tanh hidden layers (after PCA)
model21 = Sequential()
# Input layer: 9 units over 14 input features
model21.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Four hidden layers
model21.add(Dense(45, activation='tanh', kernel_initializer='normal'))  # 2nd layer
model21.add(Dense(30, activation='tanh', kernel_initializer='normal'))  # 3rd layer
model21.add(Dense(10, activation='tanh', kernel_initializer='normal'))  # 4th layer
model21.add(Dense(50, activation='tanh', kernel_initializer='normal'))  # 5th layer
# Output layer: 10-unit softmax
model21.add(Dense(10, activation='softmax', kernel_initializer='normal'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
adam11 = optimizers.Adam(learning_rate=0.01)
model21.compile(optimizer=adam11, loss='categorical_crossentropy', metrics=['accuracy'])
model21.summary()
model21.fit(X_train_sd, y_train, epochs=100)
model21.evaluate(X_train_sd, y_train)
# RMSprop neural-network classification model after PCA
# (original comment said "regression", but softmax + categorical_crossentropy
#  is a classifier)
# NOTE(review): labelled "after pca" yet still fits X_train_sd -- confirm
# the PCA-transformed data was intended.
class_model5 = Sequential()
# Input layer: 9 units over 14 input features
class_model5.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Output layer: 10-unit softmax
class_model5.add(Dense(10, kernel_initializer='normal', activation='softmax'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
rms9 = optimizers.RMSprop(learning_rate=0.01)
class_model5.compile(optimizer=rms9, loss='categorical_crossentropy', metrics=['accuracy'])
class_model5.summary()
his5 = class_model5.fit(X_train_sd, y_train, epochs=100, verbose=1)
class_model5.evaluate(X_train_sd, y_train)
# RMSprop classifier with two sigmoid hidden layers (after PCA)
model22 = Sequential()
# Input layer: 9 units over 14 input features
model22.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Two hidden layers
model22.add(Dense(35, activation='sigmoid', kernel_initializer='normal'))  # 2nd layer
model22.add(Dense(15, activation='sigmoid', kernel_initializer='normal'))  # 3rd layer
# Output layer: 10-unit softmax
model22.add(Dense(10, activation='softmax', kernel_initializer='normal'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
rms10 = optimizers.RMSprop(learning_rate=0.01)
model22.compile(optimizer=rms10, loss='categorical_crossentropy', metrics=['accuracy'])
model22.summary()
model22.fit(X_train_sd, y_train, epochs=100)
model22.evaluate(X_train_sd, y_train)
# RMSprop classifier with four sigmoid hidden layers (after PCA)
model23 = Sequential()
# Input layer: 9 units over 14 input features
model23.add(Dense(9, input_dim=14, kernel_initializer='normal', activation='relu'))
# Four hidden layers
model23.add(Dense(20, activation='sigmoid', kernel_initializer='normal'))  # 2nd layer
model23.add(Dense(55, activation='sigmoid', kernel_initializer='normal'))  # 3rd layer
model23.add(Dense(70, activation='sigmoid', kernel_initializer='normal'))  # 4th layer
model23.add(Dense(50, activation='sigmoid', kernel_initializer='normal'))  # 5th layer
# Output layer: 10-unit softmax
model23.add(Dense(10, activation='softmax', kernel_initializer='normal'))
# 'lr' is deprecated (removed in newer Keras); use learning_rate instead
rms11 = optimizers.RMSprop(learning_rate=0.01)
model23.compile(optimizer=rms11, loss='categorical_crossentropy', metrics=['accuracy'])
model23.summary()
model23.fit(X_train_sd, y_train, epochs=100)
model23.evaluate(X_train_sd, y_train)
Observation: Our best scores for regression & classification are highlighted below.
// Model Scores Regression :-
====> Regression Before PCA :
SGD - loss: 2.0075 - mean_absolute_error: 1.2159
SGD Two Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
SGD Four Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
Adam - loss: 2.0075 - mean_absolute_error: 1.2159
Adam Two Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
Adam Four Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
RMSprop - loss: 2.0075 - mean_absolute_error: 1.2159
RMSprop Two Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
RMSprop Four Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
====> Regression After PCA :
SGD - loss: 2.0075 - mean_absolute_error: 1.2159
SGD Two Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
SGD Four Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
Adam - loss: 2.0075 - mean_absolute_error: 1.2159
Adam Two Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
Adam Four Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
RMSprop - loss: 2.0075 - mean_absolute_error: 1.2159
RMSprop Two Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
RMSprop Four Hidden Layer - loss: 2.0075 - mean_absolute_error: 1.2159
// Model Scores Classification :-
====> Classification Before PCA :
SGD - loss: 0.9575 - accuracy: 0.6162
SGD Two Hidden Layer - loss: 1.2038 - accuracy: 0.4332
SGD Four Hidden Layer - loss: 1.1859 - accuracy: 0.4332
Adam - loss: 0.8157 - accuracy: 0.6583
Adam Two Hidden Layer - loss: 0.5889 - accuracy: 0.7479
Adam Four Hidden Layer - loss: 0.8692 - accuracy: 0.6218
RMSprop - loss: 0.8127 - accuracy: 0.6676
RMSprop Two Hidden Layer - loss: 0.7937 - accuracy: 0.6863
RMSprop Four Hidden Layer - loss: 0.7640 - accuracy: 0.7171
====> Classification After PCA :
SGD - loss: 0.9641 - accuracy: 0.6041
SGD Two Hidden Layer - loss: 1.2091 - accuracy: 0.4332
SGD Four Hidden Layer - loss: 1.1861 - accuracy: 0.4332
Adam - loss: 0.8262 - accuracy: 0.6489
Adam Two Hidden Layer - loss: 0.6423 - accuracy: 0.7404
Adam Four Hidden Layer - loss: 0.8663 - accuracy: 0.6676
RMSprop - loss: 0.8202 - accuracy: 0.6452
RMSprop Two Hidden Layer - loss: 0.7830 - accuracy: 0.6872
RMSprop Four Hidden Layer - loss: 0.8274 - accuracy: 0.6713
# BUILDING A GRAPHICAL USER INTERFACE
from tkinter import *  # wildcard import supplies Tk, Label, Entry, Button, W, E, END used below
window = Tk()  # the Tk root window for the whole GUI
#click function
def click():
    """Look up the text entered in `textbox` in `my_compdict` and show the result.

    Fixes vs. original: the original cleared an undefined `textbox1`
    (NameError), used the Text-widget index 0.0 on an Entry widget, and
    caught every exception with a bare `except`.
    """
    entered_text = textbox.get()
    # Entry widgets index from integer 0 (0.0 is Text-widget syntax)
    textbox.delete(0, END)
    try:
        definition = my_compdict[entered_text]
    except KeyError:  # only the expected "word not found" case
        definition = "Sorry"
    textbox.insert(END, definition)
#Exit_Button_Function
def close_window():
    """Destroy the GUI window and stop the program.

    Fix: the original destroyed an undefined `top` (NameError); the Tk root
    created in this file is named `window`.
    """
    window.destroy()
    exit()
window.title('Neural Network Project')
# NOTE(review): every Entry below rebinds the same global name `textbox`,
# so click() only ever operates on the most recently created Entry widget.
# Consider distinct names per field if the buttons should act on their own rows.
# ("Setp" typos in the labels fixed to "Step".)
# Step 1 row: file-name entry + import button
Label(window, text = "Step 1 : File Name : ", bg = 'gray', fg = 'white').grid(row = 1, column = 1, sticky = W)
textbox = Entry(window, width = 14, bg = "white")
textbox.grid(row = 1, column = 2, sticky = E)
Button(window, text = "Import Data", width = 10, command = click).grid(row = 1, column = 6, sticky = E)
textbox = Entry(window, width = 14, bg = "white")
textbox.grid(row = 1, column = 9, sticky = E)
# Step 2 row: target-column entry + import button
Label(window, text = "Step 2 : Target Column : ", bg = 'gray', fg = 'white').grid(row = 2, column = 1, sticky = W)
textbox = Entry(window, width = 14, bg = "white")
textbox.grid(row = 2, column = 2, sticky = E)
Button(window, text = "Import Target", width = 11, command = click).grid(row = 2, column = 6, sticky = E)
textbox = Entry(window, width = 14, bg = "white")
textbox.grid(row = 2, column = 9, sticky = E)
# Step 3: regressor section (train + pickle rows)
Label(window, text = "Step 3 : Neural Network Regressor ", bg = 'gray', fg = 'white').grid(row = 3, column = 1, sticky = W)
Label(window, text = " Regression ", bg = 'gray', fg = 'white').grid(row = 4, column = 1, sticky = W)
Button(window, text = "Train", width = 6, command = click).grid(row = 4, column = 2, sticky = E)
textbox = Entry(window, width = 14, bg = "white")
textbox.grid(row = 4, column = 6, sticky = E)
Label(window, text = " Pickle ", bg = 'gray', fg = 'white').grid(row = 5, column = 1, sticky = W)
Button(window, text = "Run", width = 6, command = click).grid(row = 5, column = 2, sticky = E)
textbox = Entry(window, width = 14, bg = "white")
textbox.grid(row = 5, column = 6, sticky = E)
# Step 4: classifier section (train + run rows)
Label(window, text = "Step 4 : Neural Network Classifier ", bg = 'gray', fg = 'white').grid(row = 6, column = 1, sticky = W)
Label(window, text = " Classifier ", bg = 'gray', fg = 'white').grid(row = 7, column = 1, sticky = W)
Button(window, text = "Train", width = 6, command = click).grid(row = 7, column = 2, sticky = E)
textbox = Entry(window, width = 14, bg = "white")
textbox.grid(row = 7, column = 6, sticky = E)
Button(window, text = "Run", width = 6, command = click).grid(row = 8, column = 2, sticky = E)
textbox = Entry(window, width = 14, bg = "white")
textbox.grid(row = 8, column = 6, sticky = E)
# Start the Tk event loop (blocks until the window is closed)
window.mainloop()
# Importing the necessary file
import h5py # HDF5 reader for the SVHN dataset file
from sklearn.preprocessing import OneHotEncoder # (re-import; already imported at top of file)
from sklearn.metrics import confusion_matrix # (re-import; already imported at top of file)
from skimage.color import rgb2gray # (re-import; already imported at top of file)
# Read every split into memory *inside* the context manager: h5py Dataset
# handles become unusable once the file is closed, so np.array() must run
# while the file is still open.
with h5py.File('SVHN_single.h5', 'r') as h5file:
    ls = list(h5file.keys())
    print('List of datasets in this file: \n', ls)
    X_train = np.array(h5file.get('X_train'))
    X_test = np.array(h5file.get('X_test'))
    X_val = np.array(h5file.get('X_val'))
    y_train = np.array(h5file.get('y_train'))
    y_test = np.array(h5file.get('y_test'))
    y_val = np.array(h5file.get('y_val'))
# Analysing the shape of the data
print('Training set :', X_train.shape, y_train.shape)
print('Test set :', X_test.shape, y_test.shape)
print('Validation Set :', X_val.shape, y_val.shape)
# Merging validation set into training set prior data splitting
# (at most the first 60000 validation samples are folded in; a fresh
#  validation split is carved out again further below)
X_train = np.concatenate((X_train, X_val[:60000]))
y_train = np.concatenate((y_train, y_val[:60000]))
# release the merged arrays to keep peak memory down
del X_val, y_val
# Shape of data after merging
print('Training set :', X_train.shape, y_train.shape)
print('Test set :', X_test.shape, y_test.shape)
# Function to plot sample images
def plot_images(images, labels, num_row = 2, num_col = 5):
    """Display the first num_row*num_col images in a grid, titled by label.

    images : indexable collection of 2-D (grayscale) image arrays
    labels : matching collection of titles for each image
    """
    plt.rcParams['axes.grid'] = False
    fig, axes = plt.subplots(num_row, num_col, figsize = (2*num_col, 2*num_row))
    for i in range(num_row * num_col):
        ax = axes[i//num_col, i%num_col]
        ax.imshow(images[i], cmap = 'gray')
        ax.set_title(labels[i], weight = 'bold', fontsize = 20)
    # BUG FIX: the original read `plt.tight_layout` (attribute access only,
    # the layout pass never ran); it must be called.
    plt.tight_layout()
# Show the first 10 training and test images with their labels
plot_images(X_train, y_train)
plot_images(X_test, y_test)
# Function to plot distribution of data
def plot_distribution(y1, y2, title1, title2):
    """Side-by-side bar charts of class counts for two label arrays.

    y1, y2 : assumed to hold non-negative integer class labels 0-9
             (np.bincount requires ints) -- TODO confirm dtype at call sites.
    title1, title2 : subplot titles.
    """
    plt.rcParams['axes.facecolor'] = '#E6E6E6'
    plt.rcParams['axes.grid'] = True
    plt.rcParams['axes.axisbelow'] = True
    plt.rcParams['grid.color'] = 'w'
    plt.rcParams['figure.figsize'] = (12, 4)
    fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True)
    fig.suptitle('Class Distribution', fontsize=15, fontweight='bold', y=1.05)
    # minlength=10 guards against a split missing a class entirely, which
    # would make bincount() shorter than the 10 x positions and crash bar().
    ax1.bar(np.arange(10), np.bincount(y1, minlength=10))
    ax1.set_title(title1)
    ax1.set_xlim(-0.5, 9.5)
    ax1.set_xticks(np.arange(10))
    ax2.bar(np.arange(10), np.bincount(y2, minlength=10), color='coral')
    ax2.set_title(title2)
    fig.tight_layout()
# Plotting class distribution of training set and test set
plot_distribution(y_train, y_test, "Training set", "Test set")
# Splitting train set into train and validation set (80/20, fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=42)
# Plotting class distribution of training set and validation set
plot_distribution(y_train, y_val, "Training set", "Validation set")
# Calculate the mean and the std on the training dataset
# (per-pixel statistics over axis 0; test/val are normalized with these
#  same training statistics below to avoid leakage)
train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)
Normalize images:
Normalization refers to normalizing the data dimensions so that they are of approximately the same scale: divide each dimension by its standard deviation, once it has been zero-centered.
# Zero-center every split with the training mean, then scale by the training std.
X_train, X_test, X_val = [
    (split - train_mean) / train_std for split in (X_train, X_test, X_val)
]
# Re-plot a few normalized training images as a sanity check.
plot_images(X_train, y_train)
# Fit a OneHotEncoder on the training labels, then one-hot encode every split
# into dense 10-column arrays (ready for the CNN's softmax output).
enc = OneHotEncoder().fit(y_train.reshape(-1, 1))
y_train = enc.transform(y_train.reshape(-1, 1)).toarray()
y_test = enc.transform(y_test.reshape(-1, 1)).toarray()
y_val = enc.transform(y_val.reshape(-1, 1)).toarray()
# Label shapes after one-hot encoding.
print("Training set", y_train.shape)
print("Validation set", y_val.shape)
print("Test set", y_test.shape)
# Append a channel axis: (N, 32, 32) -> (N, 32, 32, 1), as Conv2D expects.
X_train = X_train.reshape(-1, 32, 32, 1)
X_test = X_test.reshape(-1, 32, 32, 1)
X_val = X_val.reshape(-1, 32, 32, 1)
In order to get more robust results out of our model, we are going to augment the images in the dataset, by randomly rotating them, zooming them in and out, shifting them up and down (IMPORTANT NOTE: It is best that we do not shift them horizontally, since there are also distracting digits in the images), shifting their channels and shearing them.
# Augmentation pipeline: small rotations, mild zoom, vertical shifts and a
# little shear -- deliberately no horizontal shift, so neighbouring digits
# in the SVHN crops are not dragged into frame.
datagen = ImageDataGenerator(
    rotation_range=8,
    zoom_range=[0.95, 1.05],
    height_shift_range=0.10,
    shear_range=0.15,
)
# Define CNN model
# BUG FIX: the bare name `keras` is used throughout this block but was never
# imported (only `tensorflow` and specific tensorflow.keras submodules were).
from tensorflow import keras
keras.backend.clear_session()  # reset layer-name counters / drop any stale graph state
model = keras.Sequential([
    # Conv block 1: two 32-filter 3x3 convs, batch-norm, 2x2 pool, dropout
    keras.layers.Conv2D(32, (3, 3), padding='same',
                        activation='relu',
                        input_shape=(32, 32, 1)),
    keras.layers.BatchNormalization(),
    keras.layers.Conv2D(32, (3, 3), padding='same',
                        activation='relu'),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Dropout(0.3),
    # Conv block 2: 64 filters
    keras.layers.Conv2D(64, (3, 3), padding='same',
                        activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Conv2D(64, (3, 3), padding='same',
                        activation='relu'),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Dropout(0.3),
    # Conv block 3: 128 filters
    keras.layers.Conv2D(128, (3, 3), padding='same',
                        activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Conv2D(128, (3, 3), padding='same',
                        activation='relu'),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Dropout(0.3),
    # Classification head: flatten -> dense -> dropout -> 10-way softmax
    keras.layers.Flatten(),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(10, activation='softmax')
])
# Stop training once validation stops improving for 8 consecutive epochs
early_stopping = tensorflow.keras.callbacks.EarlyStopping(patience=8)
# AMSGrad variant of Adam (default learning rate)
optimizer = tensorflow.keras.optimizers.Adam(amsgrad=True)
# Persist only the best-scoring weights seen during training
model_checkpoint = tensorflow.keras.callbacks.ModelCheckpoint('best_cnn.h5',
save_best_only=True)
# One-hot labels + softmax output -> categorical cross-entropy
model.compile(optimizer=optimizer,
loss='categorical_crossentropy',
metrics=['accuracy'])
In order to determine a good learning rate for the optimizer of our model (here, we use the AMSGrad variant of the Adam optimizer), we set a callback in an auxiliary model which will gradually increase the learning rate of the optimizer.
model.summary()
# Train the CNN on augmented batches. Model.fit has accepted generators
# since TF 2.1; fit_generator is deprecated and removed in newer releases.
history = model.fit(datagen.flow(X_train, y_train, batch_size=256),
                    epochs=50, validation_data=(X_val, y_val),
                    callbacks=[early_stopping, model_checkpoint])
# Evaluate train and validation accuracies and losses
# (pulled from the Keras History object returned by fit)
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
train_loss = history.history['loss']
val_loss = history.history['val_loss']
# Visualize epochs vs. train and validation accuracies and losses
plt.figure(figsize=(20, 10))
# left panel: accuracy curves
plt.subplot(1, 2, 1)
plt.plot(train_acc, label = 'Training Accuracy')
plt.plot(val_acc, label = 'Validation Accuracy')
plt.legend()
plt.title('Epochs vs. Training and Validation Accuracy')
# right panel: loss curves
plt.subplot(1, 2, 2)
plt.plot(train_loss, label = 'Training Loss')
plt.plot(val_loss, label = 'Validation Loss')
plt.legend()
plt.title('Epochs vs. Training and Validation Loss')
plt.show()
# Evaluate model on test set (final in-memory weights, not necessarily the
# checkpointed best -- NOTE(review): best_cnn.h5 is saved but never reloaded)
test_loss, test_acc = model.evaluate(x = X_test, y = y_test, verbose=0)
print('Test accuracy is: {:0.4f} \nTest loss is: {:0.4f}'.
format(test_acc, test_loss))
# Get predictions and apply inverse transformation to the labels
# NOTE(review): the confusion matrix below is computed on the TRAINING set.
y_pred = model.predict(X_train)
y_pred = enc.inverse_transform(y_pred)
# NOTE(review): this overwrites the one-hot y_train with integer labels in
# place -- any later cell expecting one-hot y_train will break.
y_train = enc.inverse_transform(y_train)
# Plot the confusion matrix for training set
plt.figure(dpi=300)
cm = confusion_matrix(y_train, y_pred)
plt.title('Confusion matrix for training set', weight='bold')
sns.heatmap(cm,annot=True,fmt='g',cmap='coolwarm',annot_kws={"size": 12})
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()